library(dplyr)
library(ggplot2)
library(tidyr)
library(ggplot2)
library(reshape2)
library(data.table)
library(broom)
library(olsrr)
library(corrplot)
library(plotly)

Import data

setwd("D:/Project")
air <- read.csv("Air.csv", sep=";", dec=",")
air <- air[,-c(16, 17)]

Make it long and check data. We can see outliers in -200 - replace it with NA. We can see that one of columns - Non Metanic HydroCarbons concentration have a lot of NAs. Let’s drop it and then drop NAs. We saved data loosing one column. Also we can notice that not all columns have normal distribution

str(air)
## 'data.frame':    9471 obs. of  15 variables:
##  $ Date         : Factor w/ 392 levels "","01/01/2005",..: 116 116 116 116 116 116 129 129 129 129 ...
##  $ Time         : Factor w/ 25 levels "","00.00.00",..: 20 21 22 23 24 25 2 3 4 5 ...
##  $ CO.GT.       : num  2.6 2 2.2 2.2 1.6 1.2 1.2 1 0.9 0.6 ...
##  $ PT08.S1.CO.  : int  1360 1292 1402 1376 1272 1197 1185 1136 1094 1010 ...
##  $ NMHC.GT.     : int  150 112 88 80 51 38 31 31 24 19 ...
##  $ C6H6.GT.     : num  11.9 9.4 9 9.2 6.5 4.7 3.6 3.3 2.3 1.7 ...
##  $ PT08.S2.NMHC.: int  1046 955 939 948 836 750 690 672 609 561 ...
##  $ NOx.GT.      : int  166 103 131 172 131 89 62 62 45 -200 ...
##  $ PT08.S3.NOx. : int  1056 1174 1140 1092 1205 1337 1462 1453 1579 1705 ...
##  $ NO2.GT.      : int  113 92 114 122 116 96 77 76 60 -200 ...
##  $ PT08.S4.NO2. : int  1692 1559 1555 1584 1490 1393 1333 1333 1276 1235 ...
##  $ PT08.S5.O3.  : int  1268 972 1074 1203 1110 949 733 730 620 501 ...
##  $ T            : num  13.6 13.3 11.9 11 11.2 11.2 11.3 10.7 10.7 10.3 ...
##  $ RH           : num  48.9 47.7 54 60 59.6 59.2 56.8 60 59.7 60.2 ...
##  $ AH           : num  0.758 0.726 0.75 0.787 0.789 ...
for (i in 3:ncol(air)) {
  air[,i][which(air[,i] == -200)] <- NA
}
air_long <- melt(air)
## Warning in melt(air): The melt generic in data.table has been passed a
## data.frame and will attempt to redirect to the relevant reshape2 method;
## please note that reshape2 is deprecated, and this redirection is now
## deprecated as well. To continue using melt methods from reshape2 while both
## libraries are attached, e.g. melt.list, you can prepend the namespace like
## reshape2::melt(air). In the next version, this warning will become an error.
air_long <- air_long[,-c(1,2)]
ggplot(air_long, aes(value)) + 
  geom_histogram() + 
  facet_wrap(~variable, scales = "free")
## Warning: Removed 18183 rows containing non-finite values (stat_bin).

air <- air[,c(1:4,6:15)]
air <- drop_na(air)
air_long <- melt(air)
## Warning in melt(air): The melt generic in data.table has been passed a
## data.frame and will attempt to redirect to the relevant reshape2 method;
## please note that reshape2 is deprecated, and this redirection is now
## deprecated as well. To continue using melt methods from reshape2 while both
## libraries are attached, e.g. melt.list, you can prepend the namespace like
## reshape2::melt(air). In the next version, this warning will become an error.
air_long <- air_long[,-c(1,2)]
summary(air)
##          Date            Time          CO.GT.        PT08.S1.CO.  
##  02/04/2005:  24   10.00.00: 312   Min.   : 0.100   Min.   : 647  
##  03/04/2005:  24   20.00.00: 310   1st Qu.: 1.100   1st Qu.: 956  
##  15/03/2005:  24   09.00.00: 309   Median : 1.900   Median :1085  
##  16/03/2005:  24   12.00.00: 309   Mean   : 2.182   Mean   :1120  
##  18/03/2005:  24   18.00.00: 309   3rd Qu.: 2.900   3rd Qu.:1254  
##  19/03/2005:  24   21.00.00: 309   Max.   :11.900   Max.   :2040  
##  (Other)   :6797   (Other) :5083                                  
##     C6H6.GT.     PT08.S2.NMHC.       NOx.GT.        PT08.S3.NOx.   
##  Min.   : 0.20   Min.   : 390.0   Min.   :   2.0   Min.   : 322.0  
##  1st Qu.: 4.90   1st Qu.: 760.0   1st Qu.: 103.0   1st Qu.: 642.0  
##  Median : 8.80   Median : 931.0   Median : 186.0   Median : 786.0  
##  Mean   :10.55   Mean   : 958.5   Mean   : 250.7   Mean   : 816.9  
##  3rd Qu.:14.60   3rd Qu.:1135.0   3rd Qu.: 335.0   3rd Qu.: 947.0  
##  Max.   :63.70   Max.   :2214.0   Max.   :1479.0   Max.   :2683.0  
##                                                                    
##     NO2.GT.       PT08.S4.NO2.   PT08.S5.O3.         T               RH       
##  Min.   :  2.0   Min.   : 551   Min.   : 221   Min.   :-1.90   Min.   : 9.20  
##  1st Qu.: 79.0   1st Qu.:1207   1st Qu.: 760   1st Qu.:11.20   1st Qu.:35.30  
##  Median :110.0   Median :1457   Median :1006   Median :16.80   Median :49.20  
##  Mean   :113.9   Mean   :1453   Mean   :1058   Mean   :17.76   Mean   :48.88  
##  3rd Qu.:142.0   3rd Qu.:1683   3rd Qu.:1322   3rd Qu.:23.70   3rd Qu.:62.20  
##  Max.   :333.0   Max.   :2775   Max.   :2523   Max.   :44.60   Max.   :88.70  
##                                                                               
##        AH        
##  Min.   :0.1847  
##  1st Qu.:0.6941  
##  Median :0.9539  
##  Mean   :0.9856  
##  3rd Qu.:1.2516  
##  Max.   :2.1806  
## 

So, we need to normolize our data.

normalize <- function(x) {
return ((x - min(x)) / (max(x) - min(x)))
}
air_norm <- as.data.frame(lapply(air[3:14], normalize))
air_long <- melt(air_norm)
## Warning in melt(air_norm): The melt generic in data.table has been passed
## a data.frame and will attempt to redirect to the relevant reshape2 method;
## please note that reshape2 is deprecated, and this redirection is now
## deprecated as well. To continue using melt methods from reshape2 while both
## libraries are attached, e.g. melt.list, you can prepend the namespace like
## reshape2::melt(air_norm). In the next version, this warning will become an
## error.
## No id variables; using all as measure variables

Check our data. Quantile - the proportion of cases that are less than certain values. If the requirements of “normality” are here - the data should lie on a diagonal line. We can see that several columns do not have normal distribution.

#CO.GT
qqnorm(air_norm$CO.GT., pch = 1, frame = FALSE)
qqline(air_norm$CO.GT., col = "steelblue", lwd = 2)

#PT08.S1.CO
qqnorm(air_norm$PT08.S1.CO., pch = 1, frame = FALSE)
qqline(air_norm$PT08.S1.CO., col = "steelblue", lwd = 2)

#C6H6.GT
qqnorm(air_norm$C6H6.GT., pch = 1, frame = FALSE)
qqline(air_norm$C6H6.GT., col = "steelblue", lwd = 2)

#PT08.S2.NMHC
qqnorm(air_norm$PT08.S2.NMHC., pch = 1, frame = FALSE)
qqline(air_norm$PT08.S2.NMHC., col = "steelblue", lwd = 2)

#NOx.GT
qqnorm(air_norm$NOx.GT., pch = 1, frame = FALSE)
qqline(air_norm$NOx.GT., col = "steelblue", lwd = 2)

#PT08.S3.NOx
qqnorm(air_norm$PT08.S3.NOx., pch = 1, frame = FALSE)
qqline(air_norm$PT08.S3.NOx., col = "steelblue", lwd = 2)

#NO2.GT
qqnorm(air_norm$NO2.GT., pch = 1, frame = FALSE)
qqline(air_norm$NO2.GT., col = "steelblue", lwd = 2)

#PT08.S4.NO2
qqnorm(air_norm$PT08.S4.NO2., pch = 1, frame = FALSE)
qqline(air_norm$PT08.S4.NO2., col = "steelblue", lwd = 2)

#PT08.S5.O3
qqnorm(air_norm$PT08.S5.O3., pch = 1, frame = FALSE)
qqline(air_norm$PT08.S5.O3., col = "steelblue", lwd = 2)

#T
qqnorm(air_norm$T, pch = 1, frame = FALSE)
qqline(air_norm$T, col = "steelblue", lwd = 2)

#RH
qqnorm(air_norm$RH, pch = 1, frame = FALSE)
qqline(air_norm$RH, col = "steelblue", lwd = 2)

#AH
qqnorm(air_norm$AH, pch = 1, frame = FALSE)
qqline(air_norm$AH, col = "steelblue", lwd = 2)

Response C6H6.GT. We can see columns that have linear response to C6H6.GT.

mod <- lm(data = air_norm, C6H6.GT. ~ CO.GT.)
summary(mod)
## 
## Call:
## lm(formula = C6H6.GT. ~ CO.GT., data = air_norm)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.23935 -0.02496 -0.00249  0.02357  0.76733 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.0050753  0.0009115   5.568 2.67e-08 ***
## CO.GT.      0.8952134  0.0042471 210.782  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04321 on 6939 degrees of freedom
## Multiple R-squared:  0.8649, Adjusted R-squared:  0.8649 
## F-statistic: 4.443e+04 on 1 and 6939 DF,  p-value: < 2.2e-16
augment(mod)%>%
   ggplot(aes(x = CO.GT., y = C6H6.GT.))+
  geom_point()+
  geom_line(aes(y= .fitted), color = "blue", size = 1)

residuals(mod) %>% hist(main = "Residuals CO.GT.")

plot(mod, which = 1)

mod1 <- lm(data = air_norm, C6H6.GT. ~ PT08.S1.CO.)
summary(mod1)
## 
## Call:
## lm(formula = C6H6.GT. ~ PT08.S1.CO., data = air_norm)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.18826 -0.03559 -0.00288  0.03079  0.70768 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.059959   0.001613  -37.18   <2e-16 ***
## PT08.S1.CO.  0.656927   0.004312  152.37   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0564 on 6939 degrees of freedom
## Multiple R-squared:  0.7699, Adjusted R-squared:  0.7699 
## F-statistic: 2.322e+04 on 1 and 6939 DF,  p-value: < 2.2e-16
augment(mod1)%>%
   ggplot(aes(x = PT08.S1.CO., y = C6H6.GT.))+
  geom_point()+
  geom_line(aes(y= .fitted), color = "blue", size = 1)

residuals(mod1) %>% hist(main = "Residuals PT08.S1.CO.")

plot(mod1, which = 1)

mod2 <- lm(data = air_norm, C6H6.GT. ~ PT08.S2.NMHC.)
summary(mod2)
## 
## Call:
## lm(formula = C6H6.GT. ~ PT08.S2.NMHC., data = air_norm)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.018385 -0.015140 -0.007871  0.007973  0.287650 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -0.0856861  0.0006204  -138.1   <2e-16 ***
## PT08.S2.NMHC.  0.7980363  0.0018053   442.1   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02177 on 6939 degrees of freedom
## Multiple R-squared:  0.9657, Adjusted R-squared:  0.9657 
## F-statistic: 1.954e+05 on 1 and 6939 DF,  p-value: < 2.2e-16
augment(mod2)%>%
   ggplot(aes(x = PT08.S2.NMHC., y = C6H6.GT.))+
  geom_point()+
  geom_line(aes(y= .fitted), color = "blue", size = 1)

residuals(mod2) %>% hist(main = "Residuals PT08.S2.NMHC.")

plot(mod2, which = 1)

mod3 <- lm(data = air_norm, C6H6.GT. ~ NOx.GT.)
summary(mod3)
## 
## Call:
## lm(formula = C6H6.GT. ~ NOx.GT., data = air_norm)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.20002 -0.06256 -0.01753  0.04394  0.61415 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.062395   0.001528   40.84   <2e-16 ***
## NOx.GT.     0.597921   0.006951   86.01   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08179 on 6939 degrees of freedom
## Multiple R-squared:  0.516,  Adjusted R-squared:  0.5159 
## F-statistic:  7398 on 1 and 6939 DF,  p-value: < 2.2e-16
augment(mod3)%>%
   ggplot(aes(x = NOx.GT., y = C6H6.GT.))+
  geom_point()+
  geom_line(aes(y= .fitted), color = "blue", size = 1)

residuals(mod3) %>% hist(main = "Residuals NOx.GT.")

plot(mod3, which = 1)

mod4 <- lm(data = air_norm, C6H6.GT. ~ PT08.S3.NOx.)
summary(mod4)
## 
## Call:
## lm(formula = C6H6.GT. ~ PT08.S3.NOx., data = air_norm)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.14285 -0.05823 -0.01275  0.03632  0.69946 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.330684   0.002140  154.49   <2e-16 ***
## PT08.S3.NOx. -0.799672   0.009101  -87.87   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08089 on 6939 degrees of freedom
## Multiple R-squared:  0.5267, Adjusted R-squared:  0.5266 
## F-statistic:  7721 on 1 and 6939 DF,  p-value: < 2.2e-16
augment(mod4)%>%
   ggplot(aes(x = PT08.S3.NOx., y = C6H6.GT.))+
  geom_point()+
  geom_line(aes(y= .fitted), color = "blue", size = 1)

residuals(mod4) %>% hist(main = "Residuals PT08.S3.NOx.")

plot(mod4, which = 1)

mod5 <- lm(data = air_norm, C6H6.GT. ~ NO2.GT.)
summary(mod5)
## 
## Call:
## lm(formula = C6H6.GT. ~ NO2.GT., data = air_norm)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.28123 -0.05625 -0.00966  0.04071  0.68139 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.004056   0.002881  -1.408    0.159    
## NO2.GT.      0.494451   0.007848  63.005   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09377 on 6939 degrees of freedom
## Multiple R-squared:  0.3639, Adjusted R-squared:  0.3638 
## F-statistic:  3970 on 1 and 6939 DF,  p-value: < 2.2e-16
augment(mod5)%>%
   ggplot(aes(x = NO2.GT., y = C6H6.GT.))+
  geom_point()+
  geom_line(aes(y= .fitted), color = "blue", size = 1)

residuals(mod5) %>% hist(main = "Residuals NO2.GT.")

plot(mod5, which = 1)

mod6 <- lm(data = air_norm, C6H6.GT. ~ PT08.S4.NO2.)
summary(mod6)
## 
## Call:
## lm(formula = C6H6.GT. ~ PT08.S4.NO2., data = air_norm)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.15656 -0.05635 -0.00724  0.04597  0.79223 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -0.065500   0.002506  -26.14   <2e-16 ***
## PT08.S4.NO2.  0.563771   0.005755   97.96   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.07616 on 6939 degrees of freedom
## Multiple R-squared:  0.5803, Adjusted R-squared:  0.5803 
## F-statistic:  9596 on 1 and 6939 DF,  p-value: < 2.2e-16
augment(mod6)%>%
   ggplot(aes(x = PT08.S4.NO2., y = C6H6.GT.))+
  geom_point()+
  geom_line(aes(y= .fitted), color = "blue", size = 1)

residuals(mod6) %>% hist(main = "Residuals PT08.S4.NO2.")

plot(mod6, which = 1)

mod7 <- lm(data = air_norm,C6H6.GT.  ~ PT08.S5.O3.)
summary(mod7)
## 
## Call:
## lm(formula = C6H6.GT. ~ PT08.S5.O3., data = air_norm)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.24981 -0.03674 -0.00059  0.03326  0.52482 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.045328   0.001642  -27.61   <2e-16 ***
## PT08.S5.O3.  0.573303   0.004063  141.11   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05977 on 6939 degrees of freedom
## Multiple R-squared:  0.7416, Adjusted R-squared:  0.7415 
## F-statistic: 1.991e+04 on 1 and 6939 DF,  p-value: < 2.2e-16
augment(mod7)%>%
   ggplot(aes(x = PT08.S5.O3., y = C6H6.GT.))+
  geom_point()+
  geom_line(aes(y= .fitted), color = "blue", size = 1)

residuals(mod7) %>% hist(main = "Residuals PT08.S5.O3.")

plot(mod7, which = 1)

mod8 <- lm(data = air_norm,C6H6.GT. ~ `T`)
summary(mod8)
## 
## Call:
## lm(formula = C6H6.GT. ~ T, data = air_norm)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.17626 -0.08503 -0.02819  0.05791  0.86597 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.113686   0.003377   33.66   <2e-16 ***
## T           0.116814   0.007286   16.03   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1155 on 6939 degrees of freedom
## Multiple R-squared:  0.03572,    Adjusted R-squared:  0.03558 
## F-statistic: 257.1 on 1 and 6939 DF,  p-value: < 2.2e-16
augment(mod8)%>%
   ggplot(aes(x = `T`, y = C6H6.GT.))+
  geom_point()+
  geom_line(aes(y= .fitted), color = "blue", size = 1)

residuals(mod8) %>% hist(main = "Residuals T")

plot(mod8, which = 1)

mod9 <- lm(data = air_norm, C6H6.GT. ~ RH)
summary(mod9)
## 
## Call:
## lm(formula = C6H6.GT. ~ RH, data = air_norm)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.16472 -0.08857 -0.02776  0.06256  0.83736 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.168841   0.003508  48.131   <2e-16 ***
## RH          -0.011576   0.006434  -1.799   0.0721 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1175 on 6939 degrees of freedom
## Multiple R-squared:  0.0004662,  Adjusted R-squared:  0.0003221 
## F-statistic: 3.236 on 1 and 6939 DF,  p-value: 0.07206
augment(mod9)%>%
   ggplot(aes(x = RH, y = C6H6.GT.))+
  geom_point()+
  geom_line(aes(y= .fitted), color = "blue", size = 1)

residuals(mod9) %>% hist(main = "Residuals RH")

plot(mod9, which = 1)

mod10 <- lm(data = air_norm, C6H6.GT. ~ AH)
summary(mod10)
## 
## Call:
## lm(formula = C6H6.GT. ~ AH, data = air_norm)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.19607 -0.08744 -0.02681  0.06081  0.86382 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.119150   0.003096   38.49   <2e-16 ***
## AH          0.109438   0.006899   15.86   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1155 on 6939 degrees of freedom
## Multiple R-squared:  0.035,  Adjusted R-squared:  0.03486 
## F-statistic: 251.6 on 1 and 6939 DF,  p-value: < 2.2e-16
augment(mod10)%>%
   ggplot(aes(x = AH, y = C6H6.GT.))+
  geom_point()+
  geom_line(aes(y= .fitted), color = "blue", size = 1)

residuals(mod10) %>% hist(main = "Residuals AH")

plot(mod10, which = 1)

pairs(air_norm[,sapply(air_norm, is.double)])

Check correlation.

corrplot.mixed(cor(air_norm, method = "kendall"), number.cex = .7)

Check multicolinearity. I decided to see what’s going on by hands. For multic. we need to have CI>30 and VIF>10. Also, we can see it when coeficients of columns alone are not significant but together they give significant p-value.

mod <-  lm(C6H6.GT.~PT08.S2.NMHC. + NOx.GT. + PT08.S3.NOx. + NO2.GT. + PT08.S4.NO2., data = air_norm)
summary(mod)
## 
## Call:
## lm(formula = C6H6.GT. ~ PT08.S2.NMHC. + NOx.GT. + PT08.S3.NOx. + 
##     NO2.GT. + PT08.S4.NO2., data = air_norm)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.107541 -0.011598 -0.003860  0.007867  0.273775 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -0.128346   0.001736 -73.928  < 2e-16 ***
## PT08.S2.NMHC.  0.811767   0.005821 139.463  < 2e-16 ***
## NOx.GT.        0.104145   0.003158  32.974  < 2e-16 ***
## PT08.S3.NOx.   0.130556   0.003545  36.827  < 2e-16 ***
## NO2.GT.       -0.042904   0.002939 -14.597  < 2e-16 ***
## PT08.S4.NO2.   0.019687   0.003667   5.369 8.19e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01863 on 6935 degrees of freedom
## Multiple R-squared:  0.9749, Adjusted R-squared:  0.9749 
## F-statistic: 5.389e+04 on 5 and 6935 DF,  p-value: < 2.2e-16
X <- model.matrix(~PT08.S2.NMHC. + NOx.GT. + PT08.S3.NOx. + NO2.GT. + PT08.S4.NO2., data = air_norm)
XX <- t(X) %*% X
eigen <- eigen(XX)
CI <- sqrt(max(eigen$values) / min(eigen$values))
CI
## [1] 38.71032
# Also can count it via formula 1/1-R^2
ols_coll_diag(mod)
## Tolerance and Variance Inflation Factor
## ---------------------------------------
##       Variables  Tolerance       VIF
## 1 PT08.S2.NMHC. 0.07043174 14.198143
## 2       NOx.GT. 0.25131144  3.979126
## 3  PT08.S3.NOx. 0.34956090  2.860732
## 4       NO2.GT. 0.28139450  3.553730
## 5  PT08.S4.NO2. 0.14737223  6.785539
## 
## 
## Eigenvalue and Condition Index
## ------------------------------
##    Eigenvalue Condition Index    intercept PT08.S2.NMHC.     NOx.GT.
## 1 5.139259035        1.000000 0.0005983670  4.445038e-04 0.002970002
## 2 0.598258522        2.930932 0.0017127027  1.080741e-03 0.053610656
## 3 0.182088746        5.312619 0.0001264032  9.057705e-03 0.130894451
## 4 0.059729372        9.275905 0.0029993206  8.942371e-06 0.506597242
## 5 0.013526138       19.492311 0.8122319948  1.333775e-01 0.028926663
## 6 0.007138187       26.832201 0.1823312118  8.560306e-01 0.277000986
##   PT08.S3.NOx.     NO2.GT. PT08.S4.NO2.
## 1  0.001760862 0.001513613 6.783985e-04
## 2  0.065813839 0.002081996 3.339913e-05
## 3  0.042990083 0.018947690 4.823296e-02
## 4  0.079221335 0.432104143 1.009362e-02
## 5  0.806637576 0.118860131 6.737641e-04
## 6  0.003576305 0.426492427 9.402879e-01
residuals(mod) %>% hist(main = "Residuals PT08.S2.NMHC. + NOx.GT. + PT08.S3.NOx. + NO2.GT. + PT08.S4.NO2.")

plot(mod, which = 1)

mod <-  lm(C6H6.GT.~PT08.S2.NMHC. + NOx.GT. + PT08.S3.NOx. + NO2.GT., data = air_norm)
summary(mod)
## 
## Call:
## lm(formula = C6H6.GT. ~ PT08.S2.NMHC. + NOx.GT. + PT08.S3.NOx. + 
##     NO2.GT., data = air_norm)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.109097 -0.011633 -0.003863  0.007860  0.262061 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -0.124639   0.001596  -78.09   <2e-16 ***
## PT08.S2.NMHC.  0.839425   0.002714  309.24   <2e-16 ***
## NOx.GT.        0.095550   0.002728   35.03   <2e-16 ***
## PT08.S3.NOx.   0.130176   0.003552   36.65   <2e-16 ***
## NO2.GT.       -0.051248   0.002500  -20.50   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01867 on 6936 degrees of freedom
## Multiple R-squared:  0.9748, Adjusted R-squared:  0.9748 
## F-statistic: 6.708e+04 on 4 and 6936 DF,  p-value: < 2.2e-16
X <- model.matrix(~~PT08.S2.NMHC. + NOx.GT. + PT08.S3.NOx. + NO2.GT., data = air_norm)
XX <- t(X) %*% X
eigen <- eigen(XX)
CI <- sqrt(max(eigen$values) / min(eigen$values))
CI
## [1] 21.89312
# Also can count it via formula 1/1-R^2
ols_coll_diag(mod)
## Tolerance and Variance Inflation Factor
## ---------------------------------------
##       Variables Tolerance      VIF
## 1 PT08.S2.NMHC. 0.3251437 3.075563
## 2       NOx.GT. 0.3382152 2.956697
## 3  PT08.S3.NOx. 0.3497007 2.859588
## 4       NO2.GT. 0.3906213 2.560025
## 
## 
## Eigenvalue and Condition Index
## ------------------------------
##   Eigenvalue Condition Index    intercept PT08.S2.NMHC.     NOx.GT.
## 1 4.24470560        1.000000 0.0010378468   0.002942509 0.006032574
## 2 0.59765062        2.665018 0.0021545418   0.004855400 0.069713658
## 3 0.09187354        6.797177 0.0068155859   0.274595438 0.640598722
## 4 0.05225333        9.012946 0.0001823467   0.190118070 0.251256640
## 5 0.01351690       17.720879 0.9898096788   0.527488583 0.032398405
##   PT08.S3.NOx.      NO2.GT.
## 1  0.002602851 0.0031323195
## 2  0.067442792 0.0025800414
## 3  0.085012910 0.0006553564
## 4  0.035136354 0.8073370395
## 5  0.809805092 0.1862952433
residuals(mod) %>% hist(main = "Residuals PT08.S2.NMHC. + NOx.GT. + PT08.S3.NOx. + NO2.GT.")

plot(mod, which = 1)

mod <-  lm(C6H6.GT.~PT08.S2.NMHC. + NOx.GT. + PT08.S3.NOx., data = air_norm)
summary(mod)
## 
## Call:
## lm(formula = C6H6.GT. ~ PT08.S2.NMHC. + NOx.GT. + PT08.S3.NOx., 
##     data = air_norm)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.119722 -0.011908 -0.004171  0.007868  0.261766 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -0.139037   0.001476  -94.20   <2e-16 ***
## PT08.S2.NMHC.  0.836180   0.002791  299.64   <2e-16 ***
## NOx.GT.        0.065862   0.002381   27.66   <2e-16 ***
## PT08.S3.NOx.   0.144898   0.003582   40.45   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01922 on 6937 degrees of freedom
## Multiple R-squared:  0.9733, Adjusted R-squared:  0.9733 
## F-statistic: 8.421e+04 on 3 and 6937 DF,  p-value: < 2.2e-16
X <- model.matrix(~PT08.S2.NMHC. + NOx.GT. + PT08.S3.NOx., data = air_norm)
XX <- t(X) %*% X
eigen <- eigen(XX)
CI <- sqrt(max(eigen$values) / min(eigen$values))
CI
## [1] 20.35276
# Also can count it via formula 1/1-R^2
ols_coll_diag(mod)
## Tolerance and Variance Inflation Factor
## ---------------------------------------
##       Variables Tolerance      VIF
## 1 PT08.S2.NMHC. 0.3262530 3.065106
## 2       NOx.GT. 0.4708927 2.123626
## 3  PT08.S3.NOx. 0.3646055 2.742691
## 
## 
## Eigenvalue and Condition Index
## ------------------------------
##   Eigenvalue Condition Index   intercept PT08.S2.NMHC.     NOx.GT. PT08.S3.NOx.
## 1 3.31385779        1.000000 0.002141679   0.004788272 0.013280086   0.00473510
## 2 0.57863661        2.393117 0.001798209   0.008081361 0.123688850   0.06594081
## 3 0.09181841        6.007614 0.008878405   0.287030099 0.858694945   0.08654646
## 4 0.01568719       14.534308 0.987181706   0.700100268 0.004336119   0.84277763
residuals(mod) %>% hist(main = "Residuals PT08.S2.NMHC. + NOx.GT. + PT08.S3.NOx.")

plot(mod, which = 1)

mod <-  lm(C6H6.GT.~PT08.S2.NMHC. + NOx.GT., data = air_norm)
summary(mod)
## 
## Call:
## lm(formula = C6H6.GT. ~ PT08.S2.NMHC. + NOx.GT., data = air_norm)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.030226 -0.013185 -0.007315  0.008054  0.291866 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -0.0837611  0.0006203 -135.03   <2e-16 ***
## PT08.S2.NMHC.  0.7693267  0.0024996  307.77   <2e-16 ***
## NOx.GT.        0.0417186  0.0025621   16.28   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02137 on 6938 degrees of freedom
## Multiple R-squared:  0.967,  Adjusted R-squared:  0.967 
## F-statistic: 1.016e+05 on 2 and 6938 DF,  p-value: < 2.2e-16
X <- model.matrix(~PT08.S2.NMHC. + NOx.GT., data = air_norm)
XX <- t(X) %*% X
eigen <- eigen(XX)
CI <- sqrt(max(eigen$values) / min(eigen$values))
CI
## [1] 13.75912
# Also can count it via formula 1/1-R^2
ols_coll_diag(mod)
## Tolerance and Variance Inflation Factor
## ---------------------------------------
##       Variables Tolerance      VIF
## 1 PT08.S2.NMHC. 0.5024684 1.990175
## 2       NOx.GT. 0.5024684 1.990175
## 
## 
## Eigenvalue and Condition Index
## ------------------------------
##   Eigenvalue Condition Index intercept PT08.S2.NMHC.    NOx.GT.
## 1 2.70739657        1.000000 0.0205363    0.01170120 0.02448443
## 2 0.23464691        3.396789 0.3316642    0.00105034 0.47940527
## 3 0.05795652        6.834784 0.6477995    0.98724846 0.49611030
residuals(mod) %>% hist(main = "Residuals PT08.S2.NMHC. + NOx.GT.")

plot(mod, which = 1)

mod <-  lm(C6H6.GT.~PT08.S2.NMHC. + PT08.S3.NOx. + NO2.GT. + PT08.S4.NO2., data = air_norm)
summary(mod)
## 
## Call:
## lm(formula = C6H6.GT. ~ PT08.S2.NMHC. + PT08.S3.NOx. + NO2.GT. + 
##     PT08.S4.NO2., data = air_norm)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.100634 -0.012916 -0.004665  0.008467  0.236178 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -0.121785   0.001855 -65.659   <2e-16 ***
## PT08.S2.NMHC.  0.921669   0.005132 179.591   <2e-16 ***
## PT08.S3.NOx.   0.119222   0.003795  31.418   <2e-16 ***
## NO2.GT.       -0.031244   0.003138  -9.957   <2e-16 ***
## PT08.S4.NO2.  -0.041606   0.003399 -12.239   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02004 on 6936 degrees of freedom
## Multiple R-squared:  0.971,  Adjusted R-squared:  0.971 
## F-statistic: 5.8e+04 on 4 and 6936 DF,  p-value: < 2.2e-16
X <- model.matrix(~PT08.S2.NMHC. + PT08.S3.NOx. + NO2.GT. + PT08.S4.NO2., data = air_norm)
XX <- t(X) %*% X
eigen <- eigen(XX)
CI <- sqrt(max(eigen$values) / min(eigen$values))
CI
## [1] 32.21519
# Also can count it via formula 1/1-R^2
ols_coll_diag(mod)
## Tolerance and Variance Inflation Factor
## ---------------------------------------
##       Variables Tolerance      VIF
## 1 PT08.S2.NMHC. 0.1047905 9.542848
## 2  PT08.S3.NOx. 0.3528782 2.833839
## 3       NO2.GT. 0.2855272 3.502294
## 4  PT08.S4.NO2. 0.1983337 5.042007
## 
## 
## Eigenvalue and Condition Index
## ------------------------------
##    Eigenvalue Condition Index    intercept PT08.S2.NMHC. PT08.S3.NOx.
## 1 4.427747401        1.000000 0.0008376743  0.0008636789 2.645637e-03
## 2 0.423998323        3.231539 0.0012041015  0.0082522657 1.143625e-01
## 3 0.124698738        5.958822 0.0006010036  0.0032457337 2.638710e-06
## 4 0.014195851       17.660814 0.6578017378  0.3348182692 8.546539e-01
## 5 0.009359687       21.750075 0.3395554828  0.6528200525 2.833534e-02
##       NO2.GT. PT08.S4.NO2.
## 1 0.001995892  0.001248737
## 2 0.007652835  0.002431767
## 3 0.192335418  0.089591086
## 4 0.006474433  0.036506707
## 5 0.791541422  0.870221703
residuals(mod) %>% hist(main = "Residuals PT08.S2.NMHC. + PT08.S3.NOx. + NO2.GT. + PT08.S4.NO2.")

plot(mod, which = 1)

mod <-  lm(C6H6.GT.~ PT08.S3.NOx. + NO2.GT. + PT08.S4.NO2., data = air_norm)
summary(mod)
## 
## Call:
## lm(formula = C6H6.GT. ~ PT08.S3.NOx. + NO2.GT. + PT08.S4.NO2., 
##     data = air_norm)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.14144 -0.03141 -0.00723  0.02376  0.66451 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -0.121058   0.004409  -27.46   <2e-16 ***
## PT08.S3.NOx. -0.132803   0.008380  -15.85   <2e-16 ***
## NO2.GT.       0.356494   0.005413   65.86   <2e-16 ***
## PT08.S4.NO2.  0.472271   0.004363  108.25   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04762 on 6937 degrees of freedom
## Multiple R-squared:  0.836,  Adjusted R-squared:  0.8359 
## F-statistic: 1.179e+04 on 3 and 6937 DF,  p-value: < 2.2e-16
X <- model.matrix(~PT08.S3.NOx. + NO2.GT. + PT08.S4.NO2., data = air_norm)
XX <- t(X) %*% X
eigen <- eigen(XX)
CI <- sqrt(max(eigen$values) / min(eigen$values))
CI
## [1] 21.17658
# Also can count it via formula 1/1-R^2
ols_coll_diag(mod)
## Tolerance and Variance Inflation Factor
## ---------------------------------------
##      Variables Tolerance      VIF
## 1 PT08.S3.NOx. 0.4087832 2.446285
## 2      NO2.GT. 0.5421633 1.844463
## 3 PT08.S4.NO2. 0.6803347 1.469865
## 
## 
## Eigenvalue and Condition Index
## ------------------------------
##   Eigenvalue Condition Index    intercept PT08.S3.NOx.    NO2.GT. PT08.S4.NO2.
## 1  3.5420210        1.000000 0.0013271527  0.005260668 0.00574431  0.006463796
## 2  0.3258237        3.297116 0.0001216359  0.168760664 0.05972031  0.030188426
## 3  0.1200822        5.431080 0.0001372180  0.004741379 0.29291437  0.427565656
## 4  0.0120730       17.128449 0.9984139934  0.821237288 0.64162101  0.535782122
residuals(mod) %>% hist(main = "Residuals PT08.S3.NOx. + NO2.GT. + PT08.S4.NO2.")

plot(mod, which = 1)

Final model Residuals should be normally distributed and the Q-Q Plot will show this. If residuals follow close to a straight line on this plot, it is a good indication they are normally distributed.

# Good model but NOx.GT. not normally distributed
# I've decided not to transform data, because I don't want to see log-log transformation
mod <-  lm(C6H6.GT.~PT08.S2.NMHC. + NOx.GT. + PT08.S3.NOx. + NO2.GT., data = air_norm)
summary(mod)
## 
## Call:
## lm(formula = C6H6.GT. ~ PT08.S2.NMHC. + NOx.GT. + PT08.S3.NOx. + 
##     NO2.GT., data = air_norm)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.109097 -0.011633 -0.003863  0.007860  0.262061 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -0.124639   0.001596  -78.09   <2e-16 ***
## PT08.S2.NMHC.  0.839425   0.002714  309.24   <2e-16 ***
## NOx.GT.        0.095550   0.002728   35.03   <2e-16 ***
## PT08.S3.NOx.   0.130176   0.003552   36.65   <2e-16 ***
## NO2.GT.       -0.051248   0.002500  -20.50   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01867 on 6936 degrees of freedom
## Multiple R-squared:  0.9748, Adjusted R-squared:  0.9748 
## F-statistic: 6.708e+04 on 4 and 6936 DF,  p-value: < 2.2e-16
X <- model.matrix(~PT08.S2.NMHC. + NOx.GT. + PT08.S3.NOx. + NO2.GT., data = air_norm)
XX <- t(X) %*% X
eigen <- eigen(XX)
CI <- sqrt(max(eigen$values) / min(eigen$values))
CI
## [1] 21.89312
ols_coll_diag(mod)
## Tolerance and Variance Inflation Factor
## ---------------------------------------
##       Variables Tolerance      VIF
## 1 PT08.S2.NMHC. 0.3251437 3.075563
## 2       NOx.GT. 0.3382152 2.956697
## 3  PT08.S3.NOx. 0.3497007 2.859588
## 4       NO2.GT. 0.3906213 2.560025
## 
## 
## Eigenvalue and Condition Index
## ------------------------------
##   Eigenvalue Condition Index    intercept PT08.S2.NMHC.     NOx.GT.
## 1 4.24470560        1.000000 0.0010378468   0.002942509 0.006032574
## 2 0.59765062        2.665018 0.0021545418   0.004855400 0.069713658
## 3 0.09187354        6.797177 0.0068155859   0.274595438 0.640598722
## 4 0.05225333        9.012946 0.0001823467   0.190118070 0.251256640
## 5 0.01351690       17.720879 0.9898096788   0.527488583 0.032398405
##   PT08.S3.NOx.      NO2.GT.
## 1  0.002602851 0.0031323195
## 2  0.067442792 0.0025800414
## 3  0.085012910 0.0006553564
## 4  0.035136354 0.8073370395
## 5  0.809805092 0.1862952433
ols_plot_resid_fit_spread(mod)

ols_plot_obs_fit(mod)

mod <-  lm(C6H6.GT.~CO.GT. + PT08.S4.NO2., data = air_norm)
summary(mod)
## 
## Call:
## lm(formula = C6H6.GT. ~ CO.GT. + PT08.S4.NO2., data = air_norm)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.16444 -0.01879 -0.00216  0.01556  0.76399 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -0.051025   0.001126  -45.33   <2e-16 ***
## CO.GT.        0.718581   0.004320  166.32   <2e-16 ***
## PT08.S4.NO2.  0.215265   0.003322   64.81   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03411 on 6938 degrees of freedom
## Multiple R-squared:  0.9159, Adjusted R-squared:  0.9158 
## F-statistic: 3.776e+04 on 2 and 6938 DF,  p-value: < 2.2e-16
X <- model.matrix(~CO.GT. + PT08.S4.NO2., data = air_norm)
XX <- t(X) %*% X
eigen <- eigen(XX)
CI <- sqrt(max(eigen$values) / min(eigen$values))
CI
## [1] 13.33846
ols_coll_diag(mod)
## Tolerance and Variance Inflation Factor
## ---------------------------------------
##      Variables Tolerance      VIF
## 1       CO.GT. 0.6020484 1.660996
## 2 PT08.S4.NO2. 0.6020484 1.660996
## 
## 
## Eigenvalue and Condition Index
## ------------------------------
##   Eigenvalue Condition Index  intercept     CO.GT. PT08.S4.NO2.
## 1 2.76742714        1.000000 0.01578747 0.02263258  0.010088266
## 2 0.18068900        3.913562 0.29140318 0.63351920  0.006558737
## 3 0.05188386        7.303347 0.69280934 0.34384822  0.983352997
ols_plot_resid_fit_spread(mod)

ols_plot_obs_fit(mod)

Separated data in 75:25% We can see that R^2 is close to 1 which says about linear dependence. P-value is close to 0 - independent variables explain the dynamics of the dependent variable.

set.seed(2)
sep <- sample.int(n = nrow(air_norm), size = floor(.75*nrow(air_norm)))
train <- air_norm[sep,]
test <- air_norm[-sep,]
train1 <- train[,c('C6H6.GT.', 'CO.GT.', 'PT08.S4.NO2.')]
test1 <- test[,c('C6H6.GT.', 'CO.GT.', 'PT08.S4.NO2.')]
mod1 <- lm(C6H6.GT.~CO.GT. + PT08.S4.NO2., data = train1)
a1 <- summary(mod1)
pred1 <- predict(mod1, newdata = test1)
test1$C6H6.GT._pred <- pred1
head(test1)
test1 <- rbindlist(list(test1[,c(2,3,1)], test1[,c(2,3,4)]))
train1$type <- 'train'
test1$type <- 'test'
test1[1:(nrow(test1)/2),4] <- 'real value'
all <- rbind(train1, test1)
plot_ly(data = all,
        z = ~C6H6.GT.,
        y = ~PT08.S4.NO2.,
        x = ~CO.GT., opacity = 0.7, color = ~type)%>%
  layout(title = paste("R2", round(a1$r.squared, 3),
                                  sep = ": "),
                            paste("pvalue", 2.2e-16, sep = ": "),
         xaxis = list(title = "CO.GT.",
                      zeroline = FALSE),
         yaxis = list(title = "PT08.S4.NO2.",
                      zeroline = FALSE),
         zaxis = list(title = "C6H6.GT.",
                      zeroline = FALSE))